{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "# Related keywords detection"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**NOTE**: This notebook depends upon the the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch04/1.setting-up-the-retrotech-dataset.ipynb) notebook."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### BONUS [Not in Chaper]: Using NLP approach  \n",
    "\n",
    "\n",
    "### Step 1: data cleaning\n",
    "perform minimum stemming on queries and drop rare queries with count of 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: gensim in /opt/conda/lib/python3.7/site-packages (4.1.2)\r\n",
      "Requirement already satisfied: smart-open>=1.8.1 in /opt/conda/lib/python3.7/site-packages (from gensim) (5.2.1)\r\n",
      "Requirement already satisfied: numpy>=1.17.0 in /opt/conda/lib/python3.7/site-packages (from gensim) (1.19.0)\r\n",
      "Requirement already satisfied: scipy>=0.18.1 in /opt/conda/lib/python3.7/site-packages (from gensim) (1.7.2)\r\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "!pip install gensim\n",
    "import pandas \n",
    "import numpy\n",
    "from tqdm import tqdm\n",
    "import nltk\n",
    "nltk.download('wordnet')\n",
    "from nltk.corpus import wordnet \n",
    "import re\n",
    "from nltk.tokenize import RegexpTokenizer\n",
    "from gensim.models import Word2Vec\n",
    "from collections import defaultdict\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.neighbors import NearestNeighbors\n",
    "from pyspark.sql import SparkSession\n",
    "spark = SparkSession.builder.appName(\"aips-ch6\").getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "#TG to MK: Can you switch this to use the dataframe instead of the CSV, please? Data is already indexed into Solr.\n",
    "csvFile = \"../../data/retrotech/products.csv\"\n",
    "import csv\n",
    "reader = csv.reader(open(csvFile,'r'))\n",
    "products = []\n",
    "prod2 = []\n",
    "for r in reader:\n",
    "    if len(r)> 5:\n",
    "        prod2.append(r)\n",
    "        continue\n",
    "    products.append(r)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['upc', 'name', 'manufacturer', 'short_description', 'long_description'],\n",
       " ['096009010836', 'Fists of Bruce Lee - Dolby - DVD', '\\\\N', '\\\\N', '\\\\N']]"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# len(products), len(prod2)\n",
    "products[0:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'aggr_signals' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-97-d3fd6808fb26>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0maggr_signals\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maggr_signals\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0maggr_signals\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"count\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0maggr_signals\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'aggr_signals' is not defined"
     ]
    }
   ],
   "source": [
    "aggr_signals = aggr_signals[aggr_signals[\"count\"] > 1]\n",
    "aggr_signals.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2: find synonym from wordnet\n",
    "get candidate from wordnet if both query and candidate are included in wordnet. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'aggr_signals' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-98-38afd5779e3f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mqueries\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mquery\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maggr_signals\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"query\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0msynset\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mwordnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msynsets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mlemma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msynset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlemmas\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'aggr_signals' is not defined"
     ]
    }
   ],
   "source": [
    "synonym = []\n",
    "queries = []\n",
    "\n",
    "for query in tqdm(aggr_signals[\"query\"]):\n",
    "    for synset in wordnet.synsets(query):\n",
    "        for lemma in synset.lemmas():\n",
    "            candidate = lemma.name().replace(\"_\", \" \")\n",
    "            if candidate in list(aggr_signals[\"query\"]) and candidate != query:\n",
    "                queries.append(query)\n",
    "                synonym.append(candidate)\n",
    "                \n",
    "wordnet_result = pandas.DataFrame({\"queries\":queries,\"synonym\":synonym}).drop_duplicates()\n",
    "wordnet_result.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 3: find synonym from word2vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# combine queries and product descriptions to be fed into word2vec model\n",
    "tokenizer = RegexpTokenizer(r'\\w+')\n",
    "tokenized_description = [tokenizer.tokenize(text.lower()) for text in product_description.long_description]\n",
    "tokenized_query = [tokenizer.tokenize(text) for text in aggr_signals[\"query\"]]\n",
    "tokenized_all = tokenized_query + tokenized_description"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# compute word2vec for each word\n",
    "model = Word2Vec(tokenized_all, vector_size=150, window=8, min_count=1, workers=-1)\n",
    "word2vec = dict(zip(model.wv.index_to_key, model.wv.vectors))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build word weights dictionary to weigh word vectors, so that rare words get more weights\n",
    "\n",
    "tfidf = TfidfVectorizer(analyzer=lambda x: x)\n",
    "tfidf.fit(tokenized_all)\n",
    "\n",
    "max_idf = max(tfidf.idf_) #the default idf is the max of idf's for unseen words\n",
    "weights = defaultdict(lambda: max_idf, [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query_vectors = []\n",
    "tokenized_query_clean = [x for x in tokenized_query if x != []]\n",
    "queries_cleaned = [\" \".join(x) for x in tokenized_query_clean]\n",
    "\n",
    "for tokens in tokenized_query_clean:\n",
    "    tmp = []\n",
    "    for token in tokens:\n",
    "        if token in word2vec.keys():\n",
    "            tmp.append(word2vec[token] * weights[token])\n",
    "        else:\n",
    "            tmp.append(numpy.zeros(150))\n",
    "    query_vectors.append(numpy.mean(tmp, axis=0).tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "knn = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute', n_jobs=-1)\n",
    "knn.fit(numpy.stack(query_vectors))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "queries = []\n",
    "synonym = []\n",
    "cosine = []\n",
    "\n",
    "for i in tqdm(range(len(queries_cleaned))):\n",
    "    synonym_candidates = knn.kneighbors(numpy.expand_dims(query_vectors[i],axis=0), n_neighbors=6)\n",
    "    query_index = synonym_candidates[1][0].tolist()[1:] #drop first candidate which is the same of the original query\n",
    "    cosine_similarity = [1-x for x in synonym_candidates[0][0].tolist()[1:]]\n",
    "    query = queries_cleaned[i]\n",
    "    for j in range(0,5):\n",
    "        queries.append(query)\n",
    "        synonym.append(queries_cleaned[query_index[j]])\n",
    "        cosine.append(cosine_similarity[j])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "synonym_candidates = pandas.DataFrame({\"queries\":queries, \"synonym\":synonym, \"cosine\":cosine})\n",
    "synonym_candidates.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# apply cosine similarity threshold to further filter candidate synonyms\n",
    "cosine_threshold = 0.7\n",
    "word2vec_result = synonym_candidates[synonym_candidates.cosine >= cosine_threshold][[\"queries\",\"synonym\"]]\n",
    "word2vec_result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 4: combine wordnet and word2vec lists\n",
    "the pair query A and synonym B is the same as the pair query B and synonym A, sort query and synonym to keep only 1 pair."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "combined_result = wordnet_result.append(word2vec_result)\n",
    "\n",
    "def sort_pair (row): \n",
    "    return \"_\".join(sorted([row['queries'], row['synonym']])) \n",
    "\n",
    "combined_result[\"sorted_pair\"] = combined_result.apply(lambda row: sort_pair(row), axis=1)\n",
    "final_result = combined_result.groupby(\"sorted_pair\").first().reset_index().drop([\"sorted_pair\"],axis=1)\n",
    "len(final_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_result"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
